In [1]:
import numpy as np
import pandas as pd 

Data Reading¶

In [3]:
df=pd.read_csv(r"C:\Users\HOME\Downloads\archive (1)\PS_20174392719_1491204439457_log.csv")
In [4]:
df
Out[4]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 PAYMENT 9839.64 C1231006815 170136.00 160296.36 M1979787155 0.00 0.00 0 0
1 1 PAYMENT 1864.28 C1666544295 21249.00 19384.72 M2044282225 0.00 0.00 0 0
2 1 TRANSFER 181.00 C1305486145 181.00 0.00 C553264065 0.00 0.00 1 0
3 1 CASH_OUT 181.00 C840083671 181.00 0.00 C38997010 21182.00 0.00 1 0
4 1 PAYMENT 11668.14 C2048537720 41554.00 29885.86 M1230701703 0.00 0.00 0 0
... ... ... ... ... ... ... ... ... ... ... ...
6362615 743 CASH_OUT 339682.13 C786484425 339682.13 0.00 C776919290 0.00 339682.13 1 0
6362616 743 TRANSFER 6311409.28 C1529008245 6311409.28 0.00 C1881841831 0.00 0.00 1 0
6362617 743 CASH_OUT 6311409.28 C1162922333 6311409.28 0.00 C1365125890 68488.84 6379898.11 1 0
6362618 743 TRANSFER 850002.52 C1685995037 850002.52 0.00 C2080388513 0.00 0.00 1 0
6362619 743 CASH_OUT 850002.52 C1280323807 850002.52 0.00 C873221189 6510099.11 7360101.63 1 0

6362620 rows × 11 columns

image.png

In [5]:
data=df.copy()

Analysis¶

In [6]:
print(data.isnull().sum())
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
In [7]:
# Exploring transaction type
print(data.type.value_counts())
CASH_OUT    2237500
PAYMENT     2151495
CASH_IN     1399284
TRANSFER     532909
DEBIT         41432
Name: type, dtype: int64

Vizualization¶

In [8]:
type = data["type"].value_counts()
transactions = type.index
quantity = type.values

import plotly.express as px
figure = px.pie(data, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Distribution of Transaction Type")
figure.show()

Feature Selection¶

In [9]:
# Checking correlation
correlation = data.corr()
print(correlation["isFraud"].sort_values(ascending=False))
C:\Users\HOME\AppData\Local\Temp\ipykernel_1940\3404805963.py:2: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64
In [10]:
data["type"] = data["type"].map({"CASH_OUT": 1, "PAYMENT": 2, 
                                 "CASH_IN": 3, "TRANSFER": 4,
                                 "DEBIT": 5})
data["isFraud"] = data["isFraud"].map({0: "No Fraud", 1: "Fraud"})
print(data.head())
   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud               0  
1  M2044282225             0.0             0.0  No Fraud               0  
2   C553264065             0.0             0.0     Fraud               0  
3    C38997010         21182.0             0.0     Fraud               0  
4  M1230701703             0.0             0.0  No Fraud               0  
In [11]:
# splitting the data
from sklearn.model_selection import train_test_split
x = np.array(data[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
y = np.array(data[["isFraud"]])

Data Splitting¶

In [ ]:
# training a machine learning model
from sklearn.tree import DecisionTreeClassifier
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)

Model Training¶

In [ ]:
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)

Prediction/Testing¶

In [12]:
print(model.score(xtest, ytest))
0.999732814469511
In [13]:
# prediction
#features = [type, amount, oldbalanceOrg, newbalanceOrig]
features = np.array([[4, 9000.60, 9000.60, 0.0]])
print(model.predict(features))
['Fraud']